Feature Engineering and Dataset Creation


In [11]:
from __future__ import division
from IPython.display import display
from matplotlib      import pyplot as plt
%matplotlib inline

import numpy  as np
import pandas as pd
import random, sys, os

SEED   = 97
scale  = False 
minmax = False
norm   = False
nointercept = False
engineering = True

In [12]:
def load_blood_data(train=True, SEED=97, scale  = False, 
                                         minmax = False,
                                         norm   = False,
                                         nointercept = False,
                                         engineering = False):
    """
    Load training and test datasets
    for DrivenData's Predict Blood Donations warmup contest
    
    The training data is shuffled before it's returned; test data is not
    
    Note: patsy returns float64 data; Theano requires float32 so conversion
          will be required; the y values are converted to int32, so they're OK
    
    Arguments
    ---------
        train (bool) if True
                         y_train, X_train = load_blood_data(train=True, ...
                     if False
                         X_test, IDs = load_blood_data(train=False, ...
                         
        SEED (int)   random seed
        
        scale (bool) if True, scale the data to mean zero, var 1; standard normal
        
        minmax (2-tuple) to scale the data to a specified range, provide a
                         2-tuple (min, max)
                         
        norm (bool)  if True, L2 normalize for distance and similarity measures
        
        nointercept (bool) if True, patsy will not create an intercept
                         
                         
    Usage
    -----
    from load_blood_data import load_blood_data
    """
    from sklearn.utils         import shuffle
    from patsy                 import dmatrices, dmatrix
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import Normalizer
    import numpy  as np
    import pandas as pd
    import re
    
    global scaler
    global minmaxer
    global normalizer
    
    if (scale and minmax): raise ValueError("cannot specify both scale and minmax")
    if (scale and norm):   raise ValueError("cannot specify both scale and norm")
    if (norm  and minmax): raise ValueError("cannot specify both norm and minmax")
        
    if type(train) is not bool: raise ValueError("train must be boolean")
    if type(SEED)  is not int:  raise ValueError("SEED must be int")
    if type(scale) is not bool: raise ValueError("scale must be boolean")
    if type(norm)  is not bool: raise ValueError("norm must be boolean")
    if type(nointercept) is not bool: raise ValueError("nointercept must be boolean")
    if type(engineering) is not bool: raise ValueError("engineering must be boolean")
    
    # ------------- read the file -------------
    
    file_name = '../input/train.csv' if train else '../input/test.csv'
    data = pd.read_csv(file_name)
    
    
    # ------------- shorten the column names -------------
    
    column_names = ['ID','moSinceLast','numDonations','volume','moSinceFirst','donated']
    data.columns = column_names if train else column_names[:-1]
    
    
    # ------------- create new variables -------------
    
    if engineering:
        # Ratio of moSinceLast / moSinceFirst = moRatio
        data['moRatio'] = pd.Series(data.moSinceLast / data.moSinceFirst, index=data.index)
    
        # Ratio of (volume/numDonations) / moSinceFirst = avgDonation
        data['avgDonation'] = pd.Series((data.volume/data.numDonations) / data.moSinceFirst, index=data.index)
    
        # Ratio of moSinceFirst / numDonations = avgWait
        data['avgWait'] = pd.Series(data.moSinceFirst / data.numDonations, index=data.index)

        
    # ------------- scale the data -------------

    # transform data to mean zero, unit variance
    # ==========================================
    if scale:
        if train:
            scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
            exclude = ['ID','donated']
            data.ix[:, data.columns.difference(exclude)] = scaler.fit_transform(
                data.ix[:, data.columns.difference(exclude)].values.astype(np.float32))
        else:
            exclude = ['ID','donated']
            data.ix[:, data.columns.difference(exclude)] = scaler.transform(
                data.ix[:, data.columns.difference(exclude)].values.astype(np.float32))
            
    # transform data to fit in a range
    # ================================
    if minmax:
        if len(minmax) != 2: raise ValueError("minmax must be a 2-tuple")
        if train:
            minmaxer = MinMaxScaler(feature_range = minmax)
            exclude = ['ID','donated']
            data.ix[:, data.columns.difference(exclude)] = minmaxer.fit_transform(
                data.ix[:, data.columns.difference(exclude)].values.astype(np.float32))
        else:
            exclude = ['ID','donated']
            data.ix[:, data.columns.difference(exclude)] = minmaxer.transform(
                data.ix[:, data.columns.difference(exclude)].values.astype(np.float32))
            
    # transform data to unit vector (L2 norm for distance and similarity)
    # ===================================================================
    if norm:
        if train:
            normalizer = Normalizer(norm='l2', copy=True)
            exclude = ['ID','donated']
            data.ix[:, data.columns.difference(exclude)] = normalizer.fit_transform(
                data.ix[:, data.columns.difference(exclude)].values.astype(np.float32))
        else:
            exclude = ['ID','donated']
            data.ix[:, data.columns.difference(exclude)] = normalizer.transform(
                data.ix[:, data.columns.difference(exclude)].values.astype(np.float32))
        
        
    # ------------- create the design matrix -------------
        
    # create the datasets with a patsy formula
    formula = 'donated ~ moSinceLast * moSinceFirst +  numDonations + volume'
    
    if engineering:
        formula = formula + ' + moRatio + avgDonation + avgWait'
        
    if nointercept: 
        formula = formula + ' -1'
        
    if not train:
        match = re.search(r"~\s??(.*)", formula)
        if match:
            formula = match.group(1)
        else:
            raise ValueError("Patsy formula {} does not match the expected format".format(formula))
            
            
    # ------------- return the values -------------
            
    if train:
        y_train, X_train = dmatrices(formula, data=data, return_type="dataframe")
        y_train = np.ravel(y_train).astype(np.int32)
        
        X_train, y_train = shuffle(X_train, y_train, random_state=SEED)
        return y_train, X_train
    else:
        X_test = dmatrix(formula, data=data, return_type="dataframe")
        IDs    = data.ID.values
        return X_test, IDs

In [13]:
y_train, X_train = load_blood_data(train=True)
display(pd.DataFrame(X_train).head())

y_train, X_train = load_blood_data(train=True, SEED   = SEED, 
                                               scale  = scale, 
                                               minmax = minmax,
                                               norm   = norm,
                                               nointercept = nointercept,
                                               engineering = engineering)
display(pd.DataFrame(X_train).head())
display(pd.DataFrame(y_train).head())


Intercept moSinceLast moSinceFirst moSinceLast:moSinceFirst numDonations volume
41 1 2 2 4 2 500
422 1 4 16 64 4 1000
168 1 4 4 16 1 250
232 1 2 70 140 4 1000
55 1 0 4 0 2 500
Intercept moSinceLast moSinceFirst moSinceLast:moSinceFirst numDonations volume moRatio avgDonation avgWait
41 1 2 2 4 2 500 1.000000 125.000000 1.0
422 1 4 16 64 4 1000 0.250000 15.625000 4.0
168 1 4 4 16 1 250 1.000000 62.500000 4.0
232 1 2 70 140 4 1000 0.028571 3.571429 17.5
55 1 0 4 0 2 500 0.000000 62.500000 2.0
0
0 0
1 0
2 0
3 0
4 0

In [14]:
data = X_train

In [15]:
X_test, IDs = load_blood_data(train=False, SEED   = SEED, 
                                           scale  = scale, 
                                           minmax = minmax,
                                           norm   = norm,
                                           nointercept = nointercept,
                                           engineering = engineering)
X_test.head()


Out[15]:
Intercept moSinceLast moSinceFirst moSinceLast:moSinceFirst numDonations volume moRatio avgDonation avgWait
0 1 2 52 104 12 3000 0.038462 4.807692 4.333333
1 1 21 38 798 7 1750 0.552632 6.578947 5.428571
2 1 4 4 16 1 250 1.000000 62.500000 4.000000
3 1 11 38 418 11 2750 0.289474 6.578947 3.454545
4 1 4 34 136 12 3000 0.117647 7.352941 2.833333

In [ ]: